from datasets import load_metric


class EvaluateTool(object):
    """Score predicted utterances against gold utterances with BLEU.

    The scorer is the object returned by ``load_metric('bleu')``; it is
    created once in the constructor and reused by every ``evaluate`` call.
    """

    def __init__(self, args):
        """Store ``args`` on the instance and load the BLEU metric."""
        self.args = args
        self.bleu = load_metric('bleu')

    def evaluate(self, preds, golds, section):
        """Compute BLEU-4 and BLEU-2 for ``preds`` against ``golds``.

        Args:
            preds: Iterable of predicted utterance strings.
            golds: Iterable of gold examples; each item must support
                ``gold['utterance']`` and yield a string. Passed to the
                metric as the references for ``preds`` (presumably
                index-aligned with ``preds`` -- confirm against the caller).
            section: Not used by this method.

        Returns:
            A ``(summary, main_score)`` tuple. ``summary`` is a dict with the
            keys ``'bleu-4'`` and ``'bleu-2'``; ``main_score`` is the same
            value as ``summary['bleu-4']``.
        """
        # str.split() with no argument collapses runs of whitespace and drops
        # leading/trailing whitespace. split(' ') would instead emit
        # empty-string tokens (e.g. 'a  b' -> ['a', '', 'b']), which would be
        # counted as words and skew both the n-gram matches and the lengths.
        pred_tokens = [pred.split() for pred in preds]
        # Each prediction gets exactly one reference, so every tokenised gold
        # utterance is wrapped in its own single-element list.
        gold_tokens = [[gold['utterance'].split()] for gold in golds]

        bleu_4 = self.bleu.compute(predictions=pred_tokens,
                                   references=gold_tokens,
                                   max_order=4)['bleu']
        bleu_2 = self.bleu.compute(predictions=pred_tokens,
                                   references=gold_tokens,
                                   max_order=2)['bleu']

        return {'bleu-4': bleu_4, 'bleu-2': bleu_2}, bleu_4
